### Importing the necessary data files
# NOTE: google.colab is only available inside a Colab runtime; when running
# elsewhere, skip the upload step and place Data1.csv..Data8.csv next to the notebook.
from google.colab import files
uploaded = files.upload()
### Creating pandas dataframes
import pandas as pd
# Each DataN.csv holds one clustering dataset: feature columns plus a 'Class' label
# (and, for several files, a leftover 'Unnamed: 0' index column from to_csv).
data_1 = pd.read_csv('Data1.csv')
data_2 = pd.read_csv('Data2.csv')
data_3 = pd.read_csv('Data3.csv')
data_4 = pd.read_csv('Data4.csv')
data_5 = pd.read_csv('Data5.csv')
data_6 = pd.read_csv('Data6.csv')
data_7 = pd.read_csv('Data7.csv')
data_8 = pd.read_csv('Data8.csv')
# Previewing the table
data_1.head()
### 3 attributes (X1, X2, and X3) are available and the Class is provided
data_1.describe()
### Here class is the predicted variable and can be treated as categorical/nominal - Hence, aggregate stats don't make sense
### It can be noticed that the range of values for X1, X2 and X3 are similar and hence, no scaling is required
### As the count of rows for each column is 212, there are no missing values
### Checking the number of distinct classes
print(data_1['Class'].unique()) ### There are 7 classes. Hence, K = 7
### Making a copy of the data frame to work on
data_1_copy = data_1.copy()
### Storing the original classes of objects as label1
label1 = data_1["Class"]
# Importing required libraries
from sklearn.cluster import KMeans
# NOTE: pandas is re-imported here (harmless duplicate) and MinMaxScaler is
# imported but never used below — both are candidates for cleanup.
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from matplotlib import pyplot as plt
%matplotlib inline
km_data_1 = KMeans(n_clusters=7)
predicted_class_km = km_data_1.fit_predict(data_1_copy[['X1','X2','X3']])
predicted_class_km
### Adding the predicted class as a column in the data_1_copy df
data_1_copy['predicted_class_km'] = predicted_class_km
data_1_copy.head()
from sklearn.cluster import AgglomerativeClustering
cluster = AgglomerativeClustering(n_clusters=7, affinity='euclidean', linkage='ward')
predicted_class_h = cluster.fit_predict(data_1_copy[['X1','X2','X3']])
predicted_class_h
### Adding the predicted class as a column in the data_1_copy df
data_1_copy['predicted_class_h'] = predicted_class_h
data_1_copy.head()
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.cluster.hierarchy as shc
plt.figure(figsize=(10, 7))
plt.title("Dendrogram")
clusters = shc.linkage(data_1,
method='ward',
metric="euclidean")
shc.dendrogram(Z=clusters)
plt.show()
### Writing a function to calculate TP, FP, TN, and FN along with fscore and accuracy of the model
def validation(cluster, labels):
    """Pair-counting external validation of a clustering.

    Every pair of points is classified by whether the two points share the
    same predicted cluster (C) and/or the same true class (P):
      TP: same cluster in C and same class in P
      FP: same cluster in C but different classes in P
      FN: different clusters in C but same class in P
      TN: different clusters in C and different classes in P

    Prints and returns (F1-score, accuracy).

    cluster: predicted cluster labels (array-like, e.g. fit_predict output)
    labels:  true class labels (array-like, e.g. a pandas Series)
    """
    # Convert to plain lists so indexing is positional regardless of any
    # pandas index on the inputs.
    cluster = list(cluster)
    labels = list(labels)
    TP, TN, FP, FN = 0, 0, 0, 0
    n = len(labels)
    # Enumerate each unordered pair (i, j) exactly once
    for i in range(n):
        for j in range(i + 1, n):
            same_label = labels[i] == labels[j]
            same_cluster = cluster[i] == cluster[j]
            if same_cluster and same_label:
                TP += 1
            elif same_cluster:
                FP += 1
            elif same_label:
                FN += 1
            else:
                TN += 1
    # FIX: guard every ratio against a zero denominator (e.g. n <= 1, or a
    # clustering/labeling with no co-clustered pairs) instead of raising
    # ZeroDivisionError.
    precision = TP / (TP + FP) if TP + FP else 0.0
    recall = TP / (TP + FN) if TP + FN else 0.0
    fscore = 2 * (precision * recall) / (precision + recall) if precision + recall else 0.0
    total = TP + FP + TN + FN
    accuracy = (TP + TN) / total if total else 0.0
    print("F1-score:", fscore)
    print("Accuracy:", accuracy)
    return fscore, accuracy
### External validation of both clusterings against the true classes
validation(predicted_class_km, data_1['Class'])
validation(predicted_class_h, data_1['Class'])


### Helper for the three 3D scatter plots of data_1: colour the points of
### data_1_copy by the values found in `column` (true class or a cluster id).
# Replaces three blocks of 7 copy-pasted df/scatter lines each; colour order
# is preserved exactly.
def plot_data1_3d(column, values, colors):
    fig = plt.figure(figsize=(12, 12))
    ax = fig.add_subplot(projection='3d')
    # One scatter call per class/cluster so each gets its own colour
    for value, color in zip(values, colors):
        sub = data_1_copy[data_1_copy[column] == value]
        ax.scatter(sub['X1'], sub['X2'], sub['X3'], color=color)
    plt.show()


colors_data1 = ['green', 'blue', 'black', 'orange', 'indigo', 'red', 'violet']
### Original class values (1..7)
plot_data1_3d('Class', range(1, 8), colors_data1)
### K-Means cluster labels (0..6)
plot_data1_3d('predicted_class_km', range(7), colors_data1)
### Hierarchical cluster labels (0..6)
plot_data1_3d('predicted_class_h', range(7), colors_data1)
### Exploring Data2
data_2.describe()
### As the count of rows for each column is 404, there are no missing values
data_2_copy = data_2.copy()
# Row count per class ('Unnamed: 0' is only used as a column to count on)
data_2_copy.groupby(data_2_copy['Class'], as_index=False)['Unnamed: 0'].count()
### As there are 4 classes, K=4
km_data_2 = KMeans(n_clusters=4)
predicted_class_km2 = km_data_2.fit_predict(data_2_copy[['X', 'Y', 'C']])
predicted_class_km2
data_2_copy['predicted_class_km2'] = predicted_class_km2
data_2_copy.head()
# FIX: `affinity` was removed in scikit-learn 1.4; ward implies the Euclidean metric.
cluster2 = AgglomerativeClustering(n_clusters=4, linkage='ward')
predicted_class_h2 = cluster2.fit_predict(data_2_copy[['X', 'Y', 'C']])
predicted_class_h2
data_2_copy['predicted_class_h2'] = predicted_class_h2
data_2_copy.head()
### Dendrogram of the ward/Euclidean hierarchy for Data2
plt.figure(figsize=(10, 7))
plt.title("Dendrogram")
clusters2 = shc.linkage(data_2_copy[['X', 'Y', 'C']], method='ward', metric="euclidean")
shc.dendrogram(Z=clusters2)
plt.show()
### External validation of both clusterings against the true classes
validation(predicted_class_km2, data_2_copy['Class'])
validation(predicted_class_h2, data_2_copy['Class'])


### Helper for the 3D scatter plots of data_2: colour points by `column`'s values
def plot_data2_3d(column, values, colors, figsize=(12, 12)):
    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(projection='3d')
    for value, color in zip(values, colors):
        sub = data_2_copy[data_2_copy[column] == value]
        ax.scatter(sub['X'], sub['Y'], sub['C'], color=color)
    plt.show()


colors_data2 = ['green', 'blue', 'black', 'orange']
### Original class values (1..4)
plot_data2_3d('Class', range(1, 5), colors_data2)
### K-Means cluster labels (0..3) — the original used figsize=(13, 12) for these two
plot_data2_3d('predicted_class_km2', range(4), colors_data2, figsize=(13, 12))
### Hierarchical cluster labels (0..3)
plot_data2_3d('predicted_class_h2', range(4), colors_data2, figsize=(13, 12))
### Exploring Data3
data_3.describe()
### It can be noticed that the range of values for X1, X2 and X3 are similar and hence, no scaling is required
### As the count of rows for each column is 400, there are no missing values
print(data_3['Class'].unique())
### K=4
data_3_copy = data_3.copy()
data_3_copy.groupby(data_3_copy['Class'], as_index=False)['Unnamed: 0'].count()
km_data_3 = KMeans(n_clusters=4)
predicted_class_km3 = km_data_3.fit_predict(data_3_copy[['X1', 'X2', 'X3']])
predicted_class_km3
data_3_copy['predicted_class_km3'] = predicted_class_km3
data_3_copy.head()
# FIX: `affinity` was removed in scikit-learn 1.4; ward implies the Euclidean metric.
cluster3 = AgglomerativeClustering(n_clusters=4, linkage='ward')
predicted_class_h3 = cluster3.fit_predict(data_3_copy[['X1', 'X2', 'X3']])
predicted_class_h3
data_3_copy['predicted_class_h3'] = predicted_class_h3
data_3_copy.head()
### Dendrogram of the ward/Euclidean hierarchy for Data3
plt.figure(figsize=(10, 7))
plt.title("Dendrogram")
clusters3 = shc.linkage(data_3_copy[['X1', 'X2', 'X3']], method='ward', metric="euclidean")
shc.dendrogram(Z=clusters3)
plt.show()
### External validation of both clusterings against the true classes
validation(predicted_class_km3, data_3_copy['Class'])
validation(predicted_class_h3, data_3_copy['Class'])


### Helper for the 3D scatter plots of data_3
def plot_data3_3d(column, values, colors):
    fig = plt.figure(figsize=(12, 12))
    ax = fig.add_subplot(projection='3d')
    for value, color in zip(values, colors):
        sub = data_3_copy[data_3_copy[column] == value]
        ax.scatter(sub['X1'], sub['X2'], sub['X3'], color=color)
    plt.show()


colors_data3 = ['green', 'blue', 'black', 'orange']
### Original class values (1..4)
plot_data3_3d('Class', range(1, 5), colors_data3)
### K-Means cluster labels (0..3)
plot_data3_3d('predicted_class_km3', range(4), colors_data3)
### Hierarchical cluster labels (0..3)
plot_data3_3d('predicted_class_h3', range(4), colors_data3)
### Exploring Data4
data_4.head()
data_4.describe()
### As the count of rows for each column is 1000, there are no missing values
print(data_4['Class'].unique())
### K=2
data_4_copy = data_4.copy()
data_4_copy.groupby(data_4_copy['Class'], as_index=False)['Unnamed: 0'].count()
km_data_4 = KMeans(n_clusters=2)
predicted_class_km4 = km_data_4.fit_predict(data_4_copy[['X1', 'X2', 'X3']])
predicted_class_km4
### Adding the predicted class as a column in the data_4_copy df
data_4_copy['predicted_class_km4'] = predicted_class_km4
data_4_copy.head()
# FIX: `affinity` was removed in scikit-learn 1.4; ward implies the Euclidean metric.
cluster4 = AgglomerativeClustering(n_clusters=2, linkage='ward')
predicted_class_h4 = cluster4.fit_predict(data_4_copy[['X1', 'X2', 'X3']])
predicted_class_h4
data_4_copy['predicted_class_h4'] = predicted_class_h4
data_4_copy.head()
### Dendrogram of the ward/Euclidean hierarchy for Data4
plt.figure(figsize=(10, 7))
plt.title("Dendrogram")
clusters4 = shc.linkage(data_4_copy[['X1', 'X2', 'X3']], method='ward', metric="euclidean")
shc.dendrogram(Z=clusters4)
plt.show()
### External validation of both clusterings against the true classes
validation(predicted_class_km4, data_4_copy['Class'])
validation(predicted_class_h4, data_4_copy['Class'])


### Helper for the 3D scatter plots of data_4.
# FIX: the three original plots were missing plt.show(), unlike every other
# section; the helper renders each figure explicitly.
def plot_data4_3d(column, values, colors):
    fig = plt.figure(figsize=(12, 12))
    ax = fig.add_subplot(projection='3d')
    for value, color in zip(values, colors):
        sub = data_4_copy[data_4_copy[column] == value]
        ax.scatter(sub['X1'], sub['X2'], sub['X3'], color=color)
    plt.show()


### Original class values (1, 2), then K-Means and hierarchical cluster ids (0, 1)
plot_data4_3d('Class', [1, 2], ['green', 'blue'])
plot_data4_3d('predicted_class_km4', [0, 1], ['green', 'blue'])
plot_data4_3d('predicted_class_h4', [0, 1], ['green', 'blue'])
### Exploring Data5
data_5.head()
data_5.describe()
print(data_5['Class'].unique())
### K=2
data_5_copy = data_5.copy()
data_5_copy.groupby(data_5_copy['Class'], as_index=False)['Unnamed: 0'].count()
km_data_5 = KMeans(n_clusters=2)
predicted_class_km5 = km_data_5.fit_predict(data_5_copy[['X1', 'X2', 'X3']])
predicted_class_km5
data_5_copy['predicted_class_km5'] = predicted_class_km5
data_5_copy.head()
# FIX: `affinity` was removed in scikit-learn 1.4; ward implies the Euclidean metric.
cluster5 = AgglomerativeClustering(n_clusters=2, linkage='ward')
predicted_class_h5 = cluster5.fit_predict(data_5_copy[['X1', 'X2', 'X3']])
predicted_class_h5
data_5_copy['predicted_class_h5'] = predicted_class_h5
data_5_copy.head()
### Dendrogram of the ward/Euclidean hierarchy for Data5
plt.figure(figsize=(10, 7))
plt.title("Dendrogram")
clusters5 = shc.linkage(data_5_copy[['X1', 'X2', 'X3']], method='ward', metric="euclidean")
shc.dendrogram(Z=clusters5)
plt.show()
### External validation of both clusterings against the true classes
validation(predicted_class_km5, data_5_copy['Class'])
validation(predicted_class_h5, data_5_copy['Class'])


### Helper for the 3D scatter plots of data_5
def plot_data5_3d(column, values, colors):
    fig = plt.figure(figsize=(12, 12))
    ax = fig.add_subplot(projection='3d')
    # Plotting the 2 clusters separately, one colour each
    for value, color in zip(values, colors):
        sub = data_5_copy[data_5_copy[column] == value]
        ax.scatter(sub['X1'], sub['X2'], sub['X3'], color=color)
    plt.show()


### Original class values (1, 2), then K-Means and hierarchical cluster ids (0, 1)
plot_data5_3d('Class', [1, 2], ['green', 'blue'])
plot_data5_3d('predicted_class_km5', [0, 1], ['green', 'blue'])
plot_data5_3d('predicted_class_h5', [0, 1], ['green', 'blue'])
### Exploring Data6 (only two attributes: X1 and X2)
data_6.head()
data_6.describe()
print(data_6['Class'].unique())
### K=2
data_6_copy = data_6.copy()
data_6_copy.groupby(data_6_copy['Class'], as_index=False)['Unnamed: 0'].count()
km_data_6 = KMeans(n_clusters=2)
predicted_class_km6 = km_data_6.fit_predict(data_6_copy[['X1', 'X2']])
predicted_class_km6
data_6_copy['predicted_class_km6'] = predicted_class_km6
data_6_copy.head()
# FIX: `affinity` was removed in scikit-learn 1.4; ward implies the Euclidean metric.
cluster6 = AgglomerativeClustering(n_clusters=2, linkage='ward')
predicted_class_h6 = cluster6.fit_predict(data_6_copy[['X1', 'X2']])
predicted_class_h6
data_6_copy['predicted_class_h6'] = predicted_class_h6
data_6_copy.head()
### Dendrogram of the ward/Euclidean hierarchy for Data6.
# BUG FIX: this cell previously recomputed the Data4 linkage
# (shc.linkage(data_4_copy[['X1','X2','X3']])) — a copy-paste error. It now
# uses Data6's own two features.
plt.figure(figsize=(10, 7))
plt.title("Dendrogram")
clusters6 = shc.linkage(data_6_copy[['X1', 'X2']], method='ward', metric="euclidean")
shc.dendrogram(Z=clusters6)
plt.show()
### External validation of both clusterings against the true classes
validation(predicted_class_km6, data_6_copy['Class'])
validation(predicted_class_h6, data_6_copy['Class'])


### Helper for the 2D scatter plots of data_6.
# The per-plot colour order of the original is preserved exactly.
def plot_data6_2d(column, values, colors):
    fig = plt.figure(figsize=(11, 11))
    ax = fig.add_subplot(projection='rectilinear')
    for value, color in zip(values, colors):
        sub = data_6_copy[data_6_copy[column] == value]
        ax.scatter(sub['X1'], sub['X2'], color=color)
    plt.show()


plot_data6_2d('Class', [1, 2], ['blue', 'green'])
plot_data6_2d('predicted_class_km6', [0, 1], ['green', 'blue'])
plot_data6_2d('predicted_class_h6', [0, 1], ['blue', 'green'])
### Exploring Data7 (two attributes: X1 and X2)
data_7.head()
data_7.describe()
print(data_7['Class'].unique())
### K=6
data_7_copy = data_7.copy()
data_7_copy.groupby(data_7_copy['Class'], as_index=False)['Unnamed: 0'].count()
km_data_7 = KMeans(n_clusters=6)
predicted_class_km7 = km_data_7.fit_predict(data_7_copy[['X1', 'X2']])
predicted_class_km7
data_7_copy['predicted_class_km7'] = predicted_class_km7
data_7_copy.head()
# FIX: `affinity` was removed in scikit-learn 1.4; ward implies the Euclidean metric.
cluster7 = AgglomerativeClustering(n_clusters=6, linkage='ward')
predicted_class_h7 = cluster7.fit_predict(data_7_copy[['X1', 'X2']])
predicted_class_h7
data_7_copy['predicted_class_h7'] = predicted_class_h7
data_7_copy.head()
### Dendrogram of the ward/Euclidean hierarchy for Data7
plt.figure(figsize=(10, 7))
plt.title("Dendrogram")
clusters7 = shc.linkage(data_7_copy[['X1', 'X2']], method='ward', metric="euclidean")
shc.dendrogram(Z=clusters7)
plt.show()
### External validation of both clusterings against the true classes
validation(predicted_class_km7, data_7_copy['Class'])
validation(predicted_class_h7, data_7_copy['Class'])


### Helper for the 2D scatter plots of data_7
def plot_data7_2d(column, values, colors):
    fig = plt.figure(figsize=(11, 11))
    ax = fig.add_subplot(projection='rectilinear')
    # One scatter per class/cluster so each gets its own colour
    for value, color in zip(values, colors):
        sub = data_7_copy[data_7_copy[column] == value]
        ax.scatter(sub['X1'], sub['X2'], color=color)
    plt.show()


colors_data7 = ['green', 'blue', 'orange', 'pink', 'yellow', 'violet']
### Original class values (1..6)
plot_data7_2d('Class', range(1, 7), colors_data7)
### K-Means cluster labels (0..5)
plot_data7_2d('predicted_class_km7', range(6), colors_data7)
### Hierarchical cluster labels (0..5)
plot_data7_2d('predicted_class_h7', range(6), colors_data7)
### Exploring Data8
data_8.head()
data_8.describe()
print(data_8['Class'].unique())
### K=1 — a single class is present, per the external information
data_8_copy = data_8.copy()
data_8_copy.groupby(data_8_copy['Class'], as_index=False)['Unnamed: 0'].count()
km_data_8 = KMeans(n_clusters=1)
predicted_class_km8 = km_data_8.fit_predict(data_8_copy[['X1', 'X2', 'X3']])
predicted_class_km8
data_8_copy['predicted_class_km8'] = predicted_class_km8
data_8_copy.head()
# FIX: `affinity` was removed in scikit-learn 1.4; ward implies the Euclidean metric.
cluster8 = AgglomerativeClustering(n_clusters=1, linkage='ward')
predicted_class_h8 = cluster8.fit_predict(data_8_copy[['X1', 'X2', 'X3']])
predicted_class_h8
data_8_copy['predicted_class_h8'] = predicted_class_h8
data_8_copy.head()
### Dendrogram of the ward/Euclidean hierarchy for Data8
plt.figure(figsize=(10, 7))
plt.title("Dendrogram")
clusters8 = shc.linkage(data_8_copy[['X1', 'X2', 'X3']], method='ward', metric="euclidean")
shc.dendrogram(Z=clusters8)
plt.show()
### As the number of clusters is 1 according to the external information, there is no need to cluster the data. From the graphs, it can be seen that the distribution is homogeneous.
### Hence, we can expect an accuracy of 100%
### External validation of both single-cluster models against the true class
validation(predicted_class_km8, data_8_copy['Class'])
validation(predicted_class_h8, data_8_copy['Class'])
### 3D scatter plots: the single true class, then the single predicted cluster
### from each model — all three render the same homogeneous point cloud.
for column_name, member_value in [('Class', 1),
                                  ('predicted_class_km8', 0),
                                  ('predicted_class_h8', 0)]:
    fig = plt.figure(figsize=(11, 11))
    ax = fig.add_subplot(projection='3d')
    subset = data_8_copy[data_8_copy[column_name] == member_value]
    ax.scatter(subset['X1'], subset['X2'], subset['X3'], color='blue')
    plt.show()
# !jupyter nbconvert --to html Assignment_2_Niresh_Subramanian.ipynb